In [1]:
from IPython.display import HTML
HTML('''<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js "></script><script>
code_show=true; 
function code_toggle() {
if (code_show){
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').hide();
} else {
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').show();
}
code_show = !code_show
} 
$( document ).ready(code_toggle);</script><form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')
Out[1]:
In [2]:
%%HTML
<script src="require.js"></script>
In [3]:
#!pip install pyquadkey2
In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default='notebook'

from pyquadkey2.quadkey import TileAnchor, QuadKey
from shapely.geometry import Point, Polygon, MultiPolygon
import geopandas as gpd
import folium
from shapely import wkb, wkt
from sklearn.preprocessing import MinMaxScaler
import itertools
from tqdm.notebook import tqdm

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
spark = (SparkSession
     .builder
     .master('local[*]')
     .getOrCreate())
In [5]:
class FigureLabeler:
    """
    Add a figure or table number and caption
    """
    def __init__(self):
        self.fig_num = 1
        self.table_num = 1
    
    def fig_caption(self, title, caption):
        global fig_num
        """Print figure caption on jupyter notebook"""
        display(HTML(
            f"""<p style="font-size:12px;font-style:default;"><b>
            Figure {self.fig_num}. {title}.</b><br>{caption}</p>"""))
        self.fig_num += 1

    def table_caption(self, title, caption):
        global table_num
        """Print table caption on jupyter notebook"""
        display(HTML(
            f"""<p style="font-size:12px;font-style:default;"><b>
            Table {self.table_num}. {title}.</b><br>{caption}</p>""")
               )
        self.table_num += 1
    
    def reset_to(self, fig_num=None, table_num=None):
        """Manually reset figure number or table number."""
        if fig_num is not None:
            self.fig_num = fig_num
        if table_num is not None:
            self.table_num = table_num
            


def qk_to_bbox(q):
    """Return the Rectangle patch anchor, height, and width of a quadkey"""
    nw = QuadKey(q).to_geo(anchor=TileAnchor.ANCHOR_NW)
    se = QuadKey(q).to_geo(anchor=TileAnchor.ANCHOR_SE)
    return ((nw[1], nw[0]), np.abs(se[1]-nw[1]), np.abs(se[0]-nw[0]))

def qk_to_tile(q):
    """Return the tile (x, y) of a quadkey"""
    return QuadKey(q).to_tile()[0]


def partition_df(df, zoom, connection_type, year, quarter, scaling=False):
    """Return a zoom-summarized DataFrame of a given type, year, and quarter"""
    p_df = (df.filter((df['type'] == connection_type) &
                      (df['year'] == year) &
                      (df['quarter'] == quarter))
            .withColumn(f'level_{zoom}', substring('quadkey', 1, zoom))
            .groupby(f'level_{zoom}')
            .agg(mean('avg_d_kbps').alias('avg_download_kbps'),
                 mean('avg_u_kbps').alias('avg_upload_kbps'),
                 mean('avg_lat_ms').alias('avg_latency_ms'),
                 mean('tests').alias('avg_tests'),
                 sum('tests').alias('total_tests'),
                 mean('devices').alias('avg_devices'),
                 sum('devices').alias('total_devices')
                 )
            .sort(f'level_{zoom}')
            .toPandas()
            )

    p_df['bbox'] = p_df[f'level_{zoom}'].apply(qk_to_bbox)
    p_df = p_df.set_index(f'level_{zoom}')

    if scaling is True:
        scaler = MinMaxScaler()
        p_df = pd.concat([p_df,
                          pd.DataFrame(scaler.fit_transform(p_df.iloc[:, :-1]),
                                       index=p_df.index,
                                       columns=['norm_'+i for i in p_df.columns[:-1]])],
                         axis=1)

    p_df['tile'] = p_df['bbox'].apply(lambda x: Polygon([x[0],
                                                         (x[0][0]+x[1], x[0][1]),
                                                         (x[0][0]+x[1],
                                                          x[0][1]+x[2]),
                                                         (x[0][0], x[0][1]+x[2])
                                                         ]))

    p_df = gpd.GeoDataFrame(p_df, geometry='tile').set_crs(epsg=4326)

    return p_df


def compiled_partition_df(df, zoom, scaling=False):
    """Return a complete zoom-summarized Pandas DataFrame"""
    connection_type = ['fixed', 'mobile']
    year = [2019, 2020, 2021, 2022]
    quarter = [1, 2, 3, 4]

    p_df = pd.DataFrame()

    for t, y, q in tqdm(itertools.product(connection_type, year, quarter)):
        new_df = partition_df(df, zoom, t, y, q, scaling=scaling)
        new_df = new_df.assign(type=[t]*len(new_df),
                               year=[y]*len(new_df),
                               quarter=[q]*len(new_df))
        p_df = pd.concat([p_df, new_df])
        
    p_df = p_df.set_index(p_df.reset_index()[f'level_{zoom}'].apply(qk_to_tile))
    p_df['tile'] = p_df['bbox'].apply(lambda x: Polygon([x[0],
                                                         (x[0][0]+x[1], x[0][1]),
                                                         (x[0][0]+x[1],
                                                          x[0][1]+x[2]),
                                                         (x[0][0], x[0][1]+x[2])
                                                         ]))

    p_df = gpd.GeoDataFrame(p_df, geometry='tile').set_crs(epsg=4326)
    return p_df

labeler = FigureLabeler()

Untitled-1.png


EXECUTIVE SUMMARY


Ookla Open Data is a platform provided by Ookla, the company behind Speedtest.net, which offers access to vast global internet performance data. It allows users to explore and analyze broadband and mobile network statistics worldwide. Ookla collects data from millions of devices running their Speedtest application and compiles it into comprehensive datasets. These datasets cover various metrics such as download speed, upload speed, latency, and signal strength across different geographical locations. Ookla Open Data promotes transparency and helps drive data-driven decision-making in internet connectivity. This data was utilized by governments, regulators, internet service providers, and researchers to understand network performance, identify areas for improvement, and make informed policy decisions.

Given this rich source of data, this study compared and analyzed the performance of fixed and mobile network types in various aspects, including kilobits per second (kbps), latency, number of users, and geographic location.

The study determined the areas where fixed networks excel over mobile networks and vice versa, providing insights into these network types' relative strengths and weaknesses. The analysis confirmed that fixed lines performed better than mobile on all aspects, except that the performance of fixed lines are more variable, and ultimately the preference between both would come down to the importance of mobility, upload speed, and availability in the geographic region of the user.

By examining these performance factors, the study recommends that these conclusions can be furthered for a better understanding of the comparative advantages and limitations of fixed and mobile networks if examined in a case-by-case basis taking into account electricity availability, reliability and other different factors that affect connectivity per geographic subregion.

Introduction

Problem Statement

In what aspects (kbps, latency, number of users, and geographic location) do fixed and mobile network types outperform each other?

Motivation

In today's digital age, reliable and high-speed internet access has become essential for social, economic, and educational opportunities. However, significant disparities in internet accessibility and quality persist, hindering the progress of underserved communities. The availability of a comprehensive global speed test dataset presents a unique opportunity to address these challenges and bridge the digital divide. By processing and analyzing this dataset, we can unlock valuable insights that empower individuals, organizations, and policymakers to take targeted actions, improve internet infrastructure, and create a more inclusive digital future.

Data Source

Field Name Type Description
avg_d_kbps Integer The average latency of all tests performed in the tile, represented in milliseconds
avg_u_kbps Integer The average upload speed of all tests performed in the tile, represented in kilobits per second
avg_lat_ms Integer The average latency of all tests performed in the tile, represented in milliseconds
tests Integer The number of tests taken in the tile
devices Integer The number of unique devices contributing tests in the tile
quadkey String The quadkey representing the tile
tile String The geographic WKT format representation of the quadkey tile
type String Whether the speedtest observation is a fixed or mobile connection
year Integer The year the speedtest observation was conducted
quarter Integer The quarter the speedtest observation was conducted
Table 1: Attributes of the Ookla Speedtest Dataset

The Ookla platform generates a vast amount of Speedtest data every month from people who avail of the speedtest on their website, Speedtest.com. This data is readily found on AWS's Registry of Open Data, with documentation provided here. The dataset is saved into parquet files partitioned by type (fixed or mobile connection), year, and quarter, covering the years of 2019 to 2022. A list of features can be seen in Table 1.

The geographic information of each speedtest entry was stored through Well Known Text (WKT) geometry in tiles, following the quadkey system to manage spatial joins effectively. Under this system, the world is subdivided by quarters a certain number of times, or a zoom level (z). The resulting tiles are defined as the quartered fractions of the Earth's width/height according to the Web Mercator projection (EPSG:3857), and their dimensions can be estimated in meters, taking into account slight variations due to latitude.

In the raw dataset, a zoom level (z) of 16 was used for tiling, equivalent to an 18-arcsecond block. This means the projection of the world was subdivided by quarters 16 times. For each speedtest observation, a 16-digit long quadkey is provided, where each digit is an integer from 0 to 3 corresponding to which quarter that tile belongs to at that zoom level (see image below for an example of z = 1 to 3)

Methodology

methodology.png



  1. Data Collection: All parquet files available will be read using Apache Spark into a Spark DataFrame.
  2. Data Preprocessing: The data will be filtered, grouped, and sorted as a Spark DataFrame into more summarized and helpful formats for better analysis.
  3. Data Exploration: The data will be descriptively visualized to compare fixed and mobile performance.
    1. Comparative Summary Statistics: General statistics will be visually plotted for comparison.
    2. Time Series Vizualization: The performances will be compared with respect to time.
    3. Geospatial Vizualization The performances will be compared with respect to space.
  4. Results and Discussion: Based on observations from the data exploration stage, insights and a comparative review summary will be generated.

DATA COLLECTION


The Ookla Speedtest Dataset used in this report came from parquet files that were taken from and made available to the team via the JOJIE public dataset (/mnt/data/public/speedtest) from the Asian Institute of Management, although a copy of this may also be accessed in AWS's Registry of Open Data.

Since the shapefiles in this dataset are tiles (not the exact shape of the country), for geospatial analysis, subregions were taken from Natural Earth Shapefiles, specifically the 1:50m Cultural Vectors at Admin 0 - Countries, which can be accessed here.

Ookla Speedtest Performance

Below is a sample of the raw dataset read as a Spark DataFrame. This data includes all fixed and mobile type connections, all years, and all quarters each speedtest was observed from 2019 to 2022. The WKT tile format is read in string format, but will be converted during preprocessing.

In [6]:
# Reading the files through Spark
df = spark.read.parquet('/mnt/data/public/speedtest/parquet')

# Registering df to catalogue
df.createOrReplaceTempView('performance')

# Displaying a sample of 10
display(df.limit(10).toPandas().head())
labeler.reset_to(table_num=2)
labeler.table_caption('Raw Ookla Speedtest Performance Dataset',
                      'The tile is a Well-Known Text (WKT) shapefile in string format'
                     )
                                                                                
quadkey tile avg_d_kbps avg_u_kbps avg_lat_ms tests devices type year quarter
0 0022133222312322 POLYGON((-160.02685546875 70.6435894914449, -1... 19110 7891 77 3 2 mobile 2022 1
1 0022133222330023 POLYGON((-160.043334960938 70.6363054807905, -... 21870 11875 83 2 1 mobile 2022 1
2 0022133222330032 POLYGON((-160.037841796875 70.6363054807905, -... 14157 14560 75 14 2 mobile 2022 1
3 0022133222330100 POLYGON((-160.02685546875 70.6417687358462, -1... 5468 9886 83 1 1 mobile 2022 1
4 0022133222330102 POLYGON((-160.02685546875 70.6399478155463, -1... 24311 16243 72 1 1 mobile 2022 1

Table 2. Raw Ookla Speedtest Performance Dataset.
The tile is a Well-Known Text (WKT) shapefile in string format

Natural Earth Cultural Vector Shapefiles

Below is the shapefile summary of the Natural Earth Cultural Vector Shapefiles. Each row represents a country's shape in an EPSG:4326 projection of the world.

In [7]:
countries = gpd.read_file('map').set_crs(epsg=4326)

display(countries.head())
labeler.table_caption('Raw Natural Earth Cultural Vector Shapefiles',
                      'The table was coverted into Geopandas'
                     )
ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/share/proj failed
featurecla scalerank LABELRANK SOVEREIGNT SOV_A3 ADM0_DIF LEVEL TYPE TLC ADMIN ... FCLASS_TR FCLASS_ID FCLASS_PL FCLASS_GR FCLASS_IT FCLASS_NL FCLASS_SE FCLASS_BD FCLASS_UA geometry
0 Admin-0 country 1 3 Zimbabwe ZWE 0 2 Sovereign country 1 Zimbabwe ... NaN NaN NaN NaN NaN NaN NaN NaN NaN POLYGON ((31.28789 -22.40205, 31.19727 -22.344...
1 Admin-0 country 1 3 Zambia ZMB 0 2 Sovereign country 1 Zambia ... NaN NaN NaN NaN NaN NaN NaN NaN NaN POLYGON ((30.39609 -15.64307, 30.25068 -15.643...
2 Admin-0 country 1 3 Yemen YEM 0 2 Sovereign country 1 Yemen ... NaN NaN NaN NaN NaN NaN NaN NaN NaN MULTIPOLYGON (((53.08564 16.64839, 52.58145 16...
3 Admin-0 country 3 2 Vietnam VNM 0 2 Sovereign country 1 Vietnam ... NaN NaN NaN NaN NaN NaN NaN NaN NaN MULTIPOLYGON (((104.06396 10.39082, 104.08301 ...
4 Admin-0 country 5 3 Venezuela VEN 0 2 Sovereign country 1 Venezuela ... NaN NaN NaN NaN NaN NaN NaN NaN NaN MULTIPOLYGON (((-60.82119 9.13838, -60.94141 9...

5 rows × 169 columns

Table 3. Raw Natural Earth Cultural Vector Shapefiles.
The table was coverted into Geopandas

For this particular study, the most important columns are the geometry column containing the shape information, and the SUBREGION column containing what region each country belongs to. A full list of the subregions defined by the shapefiles are as follows:

In [8]:
fig, ax = plt.subplots(1, 2, figsize=(12,4), dpi=300)

# World by cultural boundaries
countries.plot(ax=ax[0])
ax[0].set_title('World by Cultural Boundaries')
ax[0].set_xticks([])
ax[0].set_yticks([])

 # World by subregions
temp = countries.dissolve(by='SUBREGION')[['geometry']]
temp['subregion'] = temp.index
temp.plot(ax=ax[1], column='subregion')
ax[1].set_title('World by Subregions')
ax[1].set_xticks([])
ax[1].set_yticks([])

plt.show()
labeler.reset_to(table_num=3)
labeler.fig_caption('Natural Earth Cultural World Map',
                      'Figure shows the world divided by boundaries and subregion,'
                    ' according to Natural Earth Cultural boundaries.'
                     )

Figure 1. Natural Earth Cultural World Map.
Figure shows the world divided by boundaries and subregion, according to Natural Earth Cultural boundaries.

Subregion Number of Countries
Caribbean 25
Eastern Africa 19
Western Asia 19
Western Africa 17
Southern Europe 16
Northern Europe 15
South America 13
South-Eastern Asia 11
Southern Asia 10
Eastern Europe 10
Polynesia 9
Western Europe 9
Middle Africa 9
Eastern Asia 8
Central America 8
Micronesia 7
Northern Africa 7
Northern America 5
Seven seas (open ocean) 5
Melanesia 5
Southern Africa 5
Central Asia 5
Australia and New Zealand 4
Antarctica 1
Table 4: Geographic Subregions Based on Natural Earth Shapefiles

DATA PREPROCESSING


In this section, the team prepared, cleaned, and preprocessed the collected dataset to ensure its quality and suitability for further analysis. This involved creating two main tables for the ease of later analysis:

  1. Summary Statistics Table: A table getting the means, variances, and sums of each field per year, quarter, and type.
  2. Geospatial Table: A table getting the means and sums of each field per tile at z = 7 for every year, quarter, and type, with an additional field, subregion, based on intersections with subregion shapes from the Natural Earth dataset.

Summary Statistics Table

Using Spark SQL, the Summary Statistics Table was generated statistics for analysis. A preview can be found below:

In [9]:
# Creating summary statistics dataframe
df_avg  = (spark.sql(
            """
            SELECT 
                year,
                quarter,
                type,
                AVG(avg_d_kbps) AS average_download_kbps,
                AVG(avg_u_kbps) AS average_upload_kbps,
                AVG(avg_lat_ms) AS average_latency,
                VAR_POP(avg_d_kbps) AS variance_download_kbps,
                VAR_POP(avg_u_kbps) AS variance_upload_kbps,
                VAR_POP(avg_lat_ms) AS variance_latency,
                SUM(devices) AS total_devices,
                AVG(devices) AS avg_devices,
                SUM(tests) AS total_tests,
                AVG(tests) AS avg_tests
            FROM performance
            GROUP BY year, quarter, type;
            """)).toPandas().sort_values(by='year')

# Feature engineering additional fields
df_avg['continuous_quarter'] = df_avg['year'] + df_avg['quarter'] / 4
df_avg['speed'] = (df_avg['average_download_kbps'] + 
                   df_avg['average_upload_kbps']) / 2000
df_avg.sort_values(by='continuous_quarter', inplace=True)

display(df_avg.head())
labeler.reset_to(table_num=5)
labeler.table_caption('Processed Summary Statistics Table',
                      'The table shows the computed statistics using spark sql.'
                      ' This is further enhanced by adding feature engineered fields'
                     )
                                                                                
year quarter type average_download_kbps average_upload_kbps average_latency variance_download_kbps variance_upload_kbps variance_latency total_devices avg_devices total_tests avg_tests continuous_quarter speed
13 2019 1 fixed 50879.295999 21231.945033 41.564853 3.607452e+09 1.356275e+09 17734.652032 20877133 4.280701 69942821 14.341256 2019.25 36.055621
12 2019 1 mobile 25139.942852 9392.571670 51.504502 7.852244e+08 8.258373e+07 3192.574972 10618097 3.286070 21592933 6.682543 2019.25 17.266257
4 2019 2 mobile 25936.895617 9544.628041 50.556830 8.777544e+08 8.589149e+07 3240.006467 10495087 3.142064 21234141 6.357168 2019.50 17.740762
5 2019 2 fixed 52623.905091 22332.541222 41.797200 3.869277e+09 1.493373e+09 16689.769756 20615874 4.244102 68716230 14.146317 2019.50 37.478223
25 2019 3 mobile 28449.075827 10025.894767 48.731881 1.155145e+09 9.198146e+07 2895.101348 14067596 3.506218 28273464 7.046899 2019.75 19.237485

Table 5. Processed Summary Statistics Table.
The table shows the computed statistics using spark sql. This is further enhanced by adding feature engineered fields

Geospatial Table

The map data was pre-processed by creating a boolean column to check if the tile intersects a subregion. A preview of the geospatial table can be found below. Because some tiles can intersect multiple subregions, we will find that some observations will not be mutually exclusive during geospatial analysis.

In [10]:
# Creating a complete geopandas df (all types, years, and quarters) with zoom=7
# 2 types x 4 years x 4 quarters = 32 scans
df_geo = compiled_partition_df(df, 7)

# Create boolean columns of whether a tile intersects a subregion
for region, geom in countries.dissolve(by='SUBREGION')['geometry'].items():
    df_geo[region] = df_geo.intersects(geom)

# Feature engineering additional fields
df_geo['year_quarter'] = 'Q' + df_geo['quarter'].astype(str) + ' ' + df_geo['year'].astype(str)
df_geo['speed'] = (df_geo['avg_download_kbps'] + df_geo['avg_upload_kbps']) / 2000

display(df_geo.head())
labeler.table_caption('Processed Geospatial Table',
                      'The table preserves the tiles WKT shape data with an additional subregion column.'
                      ' The index of the DataFrame refers to the x,y tile it belongs to in the quadkeys system'
                      ' at zoom level 7 (16 digit quadkeys grouped according to its first 7 digits)'
                     )
0it [00:00, ?it/s]
                                                                                
avg_download_kbps avg_upload_kbps avg_latency_ms avg_tests total_tests avg_devices total_devices bbox tile type ... South America South-Eastern Asia Southern Africa Southern Asia Southern Europe Western Africa Western Asia Western Europe year_quarter speed
level_7
(7, 27) 8421.500000 3552.500000 43.250000 1.250000 5 1.250000 5 ((-160.3125, 71.524909037328), 2.8125, 0.91229... POLYGON ((-160.31250 71.52491, -157.50000 71.5... fixed ... False False False False False False False False Q1 2019 5.987000
(4, 30) 3444.333333 3164.333333 46.666667 2.000000 6 1.333333 4 ((-168.75, 68.656554984757), 2.8125, 1.0473343... POLYGON ((-168.75000 68.65655, -165.93750 68.6... fixed ... False False False False False False False False Q1 2019 3.304333
(6, 30) 772.000000 2797.000000 40.000000 4.000000 4 4.000000 4 ((-163.125, 68.656554984757), 2.8125, 1.047334... POLYGON ((-163.12500 68.65655, -160.31250 68.6... fixed ... False False False False False False False False Q1 2019 1.784500
(6, 31) 15211.750000 20763.250000 34.833333 2.750000 33 1.333333 16 ((-163.125, 67.609220604964), 2.8125, 1.095960... POLYGON ((-163.12500 67.60922, -160.31250 67.6... fixed ... False False False False False False False False Q1 2019 17.987500
(8, 27) 6676.400000 5286.666667 41.533333 2.266667 34 1.400000 21 ((-157.5, 71.524909037328), 2.8125, 0.91229479... POLYGON ((-157.50000 71.52491, -154.68750 71.5... fixed ... False False False False False False False False Q1 2019 5.981533

5 rows × 38 columns

Table 6. Processed Geospatial Table.
The table preserves the tiles WKT shape data with an additional subregion column. The index of the DataFrame refers to the x,y tile it belongs to in the quadkeys system at zoom level 7 (16 digit quadkeys grouped according to its first 7 digits)


DATA EXPLORATION


The data is now ready for exploration. The following figures in this section provide the most interesting comparative visualizations between fixed and mobile data performance through Comparative Summary Statistics, Time Series Visualizations, and Geospatial Visualizations.

Comparative Summary Statistics

In this section, summary statistics pertaining to general fixed and mobile connection performance without respect to time or space are focused on.

In [11]:
# Convert kbps to Mbps
df_geo['avg_download_mbps'] = df_geo['avg_download_kbps'] / 1000
df_geo['avg_upload_mbps'] = df_geo['avg_upload_kbps'] / 1000

# Plot download speed vs upload speed with linear trendline
fig = px.scatter(df_geo,
                 x='avg_download_mbps',
                 y='avg_upload_mbps',
                 color='type',
                 animation_frame='year_quarter',
                 range_x=(-1, 500),
                 range_y=(-1, 300),
                 trendline='ols'
                )

# Demarcate Mbps line
fig.add_vline(x=100,
              line_width=1,
              line_dash='dot',
              line_color='gray',
              annotation_text="100 Mbps",
              annotation_position="top right")

fig.update_layout(yaxis_title='Average Upload Speed (Mbps)',
                  xaxis_title='Average Download Speed (Mbps)',
                  title='Average Download Speed in Mbps through Time',
                  template='plotly_white')

fig.show()

labeler.fig_caption('Average Download Speed in Mbps through Time',
                      ''
                     )

Figure 2. Average Download Speed in Mbps through Time.

Despite significant advancements in mobile communications, particularly with the introduction of 5G technology, fixed lines still tend to outperform mobile networks in general due to the limitations on the transfer speed and the number of concurrent users. However, it is worth noting that mobile networks have made significant progress and can now match the download speeds offered by fixed lines. Additionally, the performance of mobile networks is generally more consistent and reliable than their fixed-line counterparts, which can be prone to fluctuations in performance.

Furthermore, an interesting observation from this comparison is the availability of Burst Speed, which refers to the maximum data transfer rate that a device or connection can achieve. This Burst Speed depends on seasonality, as indicated by a pulsating graph. When considering a broadband or prepaid plan, it is advisable to focus on the sustained speed rather than the maximum speed, as there may be a significant difference between the two. While a connection may boast a high maximum speed, the actual sustained speed experienced regularly might be lower. While fixed lines generally offer better performance than mobile networks due to their inherent limitations, mobile networks have made significant progress and can now match download speeds. Mobile networks also provide more consistent performance, and when selecting a plan, it is crucial to consider the sustained speed rather than just the maximum speed to ensure a satisfactory user experience.

Time Series Visualizations

The visualizations on this section focus on changes in fixed and mobile performance over time.

In [12]:
fig = go.Figure()

# Add scatter trace for 'fixed' type
fig.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'fixed']['continuous_quarter'], 
                         y=df_avg[df_avg['type'] == 'fixed']['total_devices'], 
                         mode='lines+markers+text', name='Fixed'))

# Add scatter trace for 'mobile' type
fig.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'mobile']['continuous_quarter'], 
                         y=df_avg[df_avg['type'] == 'mobile']['total_devices'], 
                         mode='lines+markers+text', name='Mobile'))

# Update x-axis labels
ticktext = ['Q{} {}'.format(q, y) for q, y in zip(df_avg['quarter'],
                                                  df_avg['year'])]
fig.update_layout(xaxis=dict(tickmode='array', tickvals=df_avg['continuous_quarter'],
                             ticktext=ticktext))

# Set axis labels and title
fig.update_layout(xaxis_title='Quarter and Year',
                  yaxis_title='Number of Devices',
                  template='plotly_white',
                  title='Number of Devices surveyed by Type')
fig.show()

labeler.fig_caption('Number of Devices surveyed by Type',
                      ''
                     )

Figure 3. Number of Devices surveyed by Type.

As displayed in this graph, more consumers are now aware to measure their internet speed. This peaked during the pandemic period (2020) and started to dip on Q4 2020. This need only reflected on fixed lines not on mobile platforms.

In [13]:
fig = go.Figure()

# Add scatter trace for 'fixed' type
fig.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'fixed']['continuous_quarter'], 
                         y=df_avg[df_avg['type'] == 'fixed']['total_tests'], 
                         mode='lines+markers+text', name='Fixed'))

# Add scatter trace for 'mobile' type
fig.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'mobile']['continuous_quarter'], 
                         y=df_avg[df_avg['type'] == 'mobile']['total_tests'], 
                         mode='lines+markers+text', name='Mobile'))

# Update x-axis labels
ticktext = ['Q{} {}'.format(q, y) for q, y in zip(df_avg['quarter'], df_avg['year'])]
fig.update_layout(xaxis=dict(tickmode='array',
                             tickvals=df_avg['continuous_quarter'],
                             ticktext=ticktext))

# Set axis labels and title
fig.update_layout(xaxis_title='Quarter and Year',
                  yaxis_title='Frequency of Tests',
                  template='plotly_white',
                  title='Number of tests performed')
fig.show()

labeler.fig_caption('Number of tests performed',
                      ''
                     )

Figure 4. Number of tests performed.

Similar to the previous statistics, this reflected the need of people to have a reliable internet connection particularly during pandemic where most work and school activities are done online. This need only reflected on fixed lines not on mobile platforms.

In [14]:
fig1 = go.Figure()
fig2 = go.Figure()

# FIG 1
# Add scatter trace for 'fixed' type
fig1.add_trace(go.Scatter(x=(df_avg[df_avg['type'] == 'fixed']['continuous_quarter']), 
                          y=df_avg[df_avg['type'] == 'fixed']['speed'], 
                          mode='lines+markers+text', name='Fixed',
                          line=dict(color='blue'), 
                          marker=dict(color='blue')))

# Add scatter trace for 'mobile' type
fig1.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'mobile']['continuous_quarter'], 
                          y=df_avg[df_avg['type'] == 'mobile']['speed'], 
                          mode='lines+markers+text', name='Mobile',
                          line=dict(color='red'), marker=dict(color='red')))

# Update x-axis labels
ticktext = ['Q{} {}'.format(q, y) for q, y in zip(df_avg['quarter'],
                                                  df_avg['year'])]
fig1.update_layout(xaxis=dict(tickmode='array',
                              tickvals=df_avg['continuous_quarter'],
                              ticktext=ticktext))
# Set axis labels and title
fig1.update_layout(xaxis_title='Quarter and Year', yaxis_title='Latency (ms)', title='Average Latency by Type')



# FIG 2
# Add scatter trace for 'fixed' type
fig2.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'fixed']['continuous_quarter'], 
                          y=df_avg[df_avg['type'] == 'fixed']['average_latency'], 
                          mode='lines+markers+text', name='Fixed',
                          line=dict(color='blue'), marker=dict(color='blue'),
                          showlegend=False))

# Add scatter trace for 'mobile' type
fig2.add_trace(go.Scatter(x=df_avg[df_avg['type'] == 'mobile']['continuous_quarter'], 
                          y=df_avg[df_avg['type'] == 'mobile']['average_latency'], 
                          mode='lines+markers+text', name='Mobile',
                          line=dict(color='red'), marker=dict(color='red'),
                          showlegend=False))

# Update x-axis labels
ticktext = ['Q{} {}'.format(q, y) for q, y in zip(df_avg['quarter'], df_avg['year'])]
fig2.update_layout(xaxis=dict(tickmode='array', tickvals=df_avg['continuous_quarter'], ticktext=ticktext))
# Set axis labels and title
fig2.update_layout(xaxis_title='Quarter and Year',
                   yaxis_title='Latency (ms)',
                   title='Average Latency by Type',
                   template='plotly_white')


fig = make_subplots(rows=2, cols=1)
fig.add_trace(fig1.data[0], row=1, col=1)
fig.add_trace(fig1.data[1], row=1, col=1)
fig.add_trace(fig2.data[0], row=2, col=1)
fig.add_trace(fig2.data[1], row=2, col=1)


fig.update_yaxes(title_text="Average Speed (mbps) Higher is better", row=1, col=1)
fig.update_yaxes(title_text="Average Latency (ms) Lower is better", row=2, col=1)
fig.update_xaxes(visible=False, row=2, col=1)
fig.update_layout(height=700, #width=1000, 
                  title_text="Average Speed and Latency Comparison",
                  template='plotly_white',
                  xaxis=dict(tickmode='array',
                             tickvals=df_avg['continuous_quarter'],
                             ticktext=ticktext))

fig.show()
labeler.reset_to(fig_num=6)
labeler.fig_caption('Average Speed and Latency Comparison', ' ')

Figure 6. Average Speed and Latency Comparison.

This graph shows that internet services are improving in general. Average speed is increasing over time and average latency is decreaing over time for both connection types. Fixed connections also unanimously outperform mobile connections.

In [15]:
display(HTML(f"<center>{df_avg.groupby('type')[['average_latency', 'variance_latency']].mean().to_html()}</center>"))
labeler.reset_to(table_num=7)
labeler.table_caption('Average Mean and Variance Latency Comparison', ' ')
average_latency variance_latency
type
fixed 31.128697 8056.123697
mobile 43.199709 2439.105923

Table 7. Average Mean and Variance Latency Comparison.

However, despite being faster with lower latency, fixed lines experience much more variability in terms of performance, as shown in table 7.

In [16]:
# Create histogram of average latency per zoom=7 tile
fig = px.histogram(df_geo, x="avg_latency_ms",
                   color='type',
                   barmode='overlay',
                   marginal='rug',
                   height=700,
                   range_y=(0, 350)
                  )

# Demarkate 1 second line
fig.add_vline(x=1000,
              line_width=1,
              line_dash='dot',
              line_color='gray',
              annotation_text="1 second latency",
              annotation_position="top right")

# Demarkate median latency line
fig.add_vline(x=df_geo['avg_latency_ms'].quantile(0.5),
              line_width=1,
              line_dash='dot',
              line_color='white',
              annotation_text="median latency\n(46ms)",
              annotation_position="top right")

fig.update_layout(xaxis_title='Average Latency (ms)',
                  yaxis_title='Count',
                  title='Histogram of Average Latency (in milliseconds)',
                  template='plotly_white')

fig.show()

labeler.reset_to(fig_num=7)
labeler.fig_caption('Histogram of Average Latency (in milliseconds)',
                    '')

Figure 7. Histogram of Average Latency (in milliseconds).

Visualizing the distribution of fixed and mobile connections, we can see that the median latency of all speedtests are 46 milliseconds, way less that the one second mark. Despite this, there is a bump in latency between 500ms and 1000ms, or 0.5 to 1 seconds. This bump is present in both fixed and mobile connections.

Unique to fixed connections, however, is the spread of extreme latency cases. Mobile connections will max out at the 2000ms or 2 second mark, but a few fixed connections will exceed this mark all the way to 4000ms or 4 seconds.

Geospatial Visualizations

The visualizations on this section focus on changes in fixed and mobile performance over time.

In [17]:
# Transform geospatial dataset to calculate number of devices per region
subregion_df = pd.DataFrame()
for region in countries['SUBREGION'].unique():
    subregion_df[region] = df_geo.groupby([region, 'type'])[
        'total_devices'].sum(numeric_only=True)[(True)]

subregion_df = (pd.melt(subregion_df.T.reset_index(),
                        id_vars='index',
                        value_vars=['fixed', 'mobile'],
                        var_name='type', value_name='total_devices')
                .rename(columns={'index': 'SUBREGION'}))

subregion_df = (subregion_df.assign(all_devices=subregion_df.
                                    groupby('SUBREGION')
                                    .transform('sum', numeric_only=True))
                .sort_values('all_devices', ascending=True))

# Plot barplot of total devices per region (some tiles overlap many regions)
fig = px.bar(subregion_df.iloc[-20:],
             x='total_devices',
             y='SUBREGION',
             color='type',
             text_auto='.2s',
             orientation='h',
             barmode='group',
             height=700
             )

fig.update_layout(xaxis_title='Subregion',
                  yaxis_title='Count of Total Devices',
                  title='Total Number of Speedtested Devices per Subregion (2019-2022)',
                  template='plotly_white')

fig.show()

labeler.reset_to(fig_num=8)
labeler.fig_caption('Total Number of Speedtested Devices per Subregion (2019-2022)',
                    '')

Figure 8. Total Number of Speedtested Devices per Subregion (2019-2022).

When dividing by subregion, we can see that Northern America's fixed connection devices surpass any other region by far, measuring 120 million total devices from 2019 to 2022. Northern America contributes the most devices in general to Speedtest.com. Mobile connection devices lag behind considerably. Interesting to note, however, is that the Asia subregion runner-ups (South-Eastern, Southern, and Eastern Asia) have less of a gap between total fixed and mobile devices.

In [18]:
# Transform geospatial dataset to calculate % fixed devices per tile
percent_df_f = partition_df(df, 7, 'fixed', 2022, 4)
percent_df_m = partition_df(df, 7, 'mobile', 2022, 4)

percent_df = percent_df_f[['total_devices',
                           'tile']].merge(percent_df_m[['total_devices',
                                                        'tile']],
                                          how='outer',
                                          on='tile',
                                          suffixes=['_fixed', '_mobile']
                                          )

percent_df['total_devices'] = percent_df[[
    'total_devices_fixed', 'total_devices_mobile']].sum(axis=1)
percent_df['% fixed'] = (
    percent_df['total_devices_fixed'] / percent_df['total_devices']) * 100
percent_df = percent_df.drop(['total_devices_fixed',
                              'total_devices_mobile',
                              'total_devices',
                              ], axis=1)

percent_df = percent_df.fillna(0)
percent_df['% fixed'] = percent_df['% fixed'].astype(int)

# Get shape intersection of country shapes and tile shapes
res_union = percent_df.overlay(countries, how='intersection')

# Plot choropleth of % fixed users
fig = px.choropleth(res_union,
                    geojson=res_union.geometry,
                    locations=res_union.index,
                    color='% fixed',
                    color_continuous_scale='RdYlBu',
                    hover_data=['SUBREGION', 'SOVEREIGNT'],
                    height=500
                   )

fig.update_geos(fitbounds="locations", visible=True)
fig.update_layout(
    title_text='Percent of Fixed Connection Devices over Total Devices (Q4 2022)'
)
fig.update(layout = dict(title=dict(x=0.5)))
fig.update_layout(
    margin={"r":0,"t":30,"l":0,"b":10},
    coloraxis_colorbar={
        'title':'% Fixed'})
fig.update_traces(marker_line_width=0)

fig.show()

labeler.fig_caption('Percent of Fixed Connection Devices over Total Devices (Q4 2022)',
                    'A greater percentage of fixed connection (blue tiles) '
                    'means fixed devices dominate mobile. A tile with less '
                    'fixed connections (red tiles) means mobile dominates fixed.')
                                                                                

Figure 9. Percent of Fixed Connection Devices over Total Devices (Q4 2022).
A greater percentage of fixed connection (blue tiles) means fixed devices dominate mobile. A tile with less fixed connections (red tiles) means mobile dominates fixed.

When dividing by tiles, we can better see the preference of the entire American continent (both North and South) for fixed connection devices over mobile ones, with little to no mobile connection devices in the Northern-most parts of Northern America, found in Canada.

As for the other regions, mobile devices are more preferred in Western, Northern, and Eastern Africa. The exception to this is the tip of South Africa. This is consistent with Africa's Mobile Market, which expects half the population in Sub-Saharan Africa will subscribe to mobile services by 2025 (GSMA, 2019).

Eastern, Southern, and South-Eastern Asia overall have a more even preference for speedtesting mobile and fixed devices, like India. Although there are exceptions to this, such as countries like China which have more preference for fixed connections, and countries like Iran which have more preference for mobile connections.

This percentage may only suggest what devices are more prevalently used for personal entertainment and online services in these countries, but are not a strong indicator of whether these devices are actually physically more prevalent in these countries as these are just devices that choose to take Speedtest.com's test.

In [19]:
# Transform geospatial dataset to calculate avg download speed per per tile
speed_df_f = partition_df(df, 7, 'fixed', 2022, 4)
speed_df_m = partition_df(df, 7, 'mobile', 2022, 4)

speed_df = speed_df_f[['avg_download_kbps',
                       'tile']].merge(speed_df_m[['avg_download_kbps',
                                                  'tile']],
                                      how='outer',
                                      on='tile')

speed_df['avg_download_kbps'] = speed_df[['avg_download_kbps_x',
                                          'avg_download_kbps_y']].mean(axis=1)
speed_df = speed_df.drop(['avg_download_kbps_x',
                          'avg_download_kbps_y'], axis=1)

# Get shape intersection of country shapes and tile shapes
res_union = speed_df.overlay(countries, how='intersection')

# Plot choropleth of average speed
fig = px.choropleth(res_union,
                    geojson=res_union.geometry,
                    locations=res_union.index,
                    color='avg_download_kbps',
                    color_continuous_scale='viridis',
                    hover_data=['SUBREGION', 'SOVEREIGNT'],
                    height=500
                   )

fig.update_geos(fitbounds="locations", visible=True)
fig.update_layout(
    title_text='Average Download Speed (Fixed and Mobile) (Q4 2022)'
)
fig.update(layout = dict(title=dict(x=0.5)))
fig.update_layout(
    margin={"r":0,"t":30,"l":0,"b":10},
    coloraxis_colorbar={
        'title':'kbps'})
fig.update_traces(marker_line_width=0)

fig.show()

labeler.fig_caption('Average Download Speed (Fixed and Mobile) (Q4 2022)',
                    '')
                                                                                

Figure 10. Average Download Speed (Fixed and Mobile) (Q4 2022).

Geographic location does make a difference to download speed. As of the latest data, Q4 2022, we can see that China is alight with the fastest download speed (both fixed and mobile) in the world. North-Eastern Asia, Northern America, parts of Australia, and Western Europe follow suit. The color patterns that emerged in Figure 9 do not match with Figure 10. Having a dominantly fixed connection location does not necessarily dictate that it will also have faster download speeds.


RESULTS AND DISCUSSION


After going through the data, we generalize our findings to the following insights:

  1. Connection speed is improving overall. Over time, both fixed and mobile connections have been improving. From 2019 to 2022, latency has been decreasing and speed increasing. While mobile lags behind fixed in all aspects, it continues to share these improvement trends with fixed connections as connection infrastructures continue to scale and improve. This improvement was also unaffected by the uptake of device users in the 2020s due to COVID-19 lockdown conditions.

  2. Mobile connections are more consistent. Despite falling behind, mobile connection speedtests had less variance in their latency at up to around 2 seconds, compared to fixed connections, which span up to 4 seconds in some extreme cases. Mobile connections also follow a more linear trend in its download-to-upload speed, where every increase in download speed leads to only a small, but more consistent rise in upload speed. In the case of fixed connections, cases of fast upload but slow download speeds are as common as the opposite, with a less apparent trendline.

  3. Connection speed is geographically disparate. Seen in Figure 10 above, most of the world is shrouded in slower download speeds regardless of the connection type that dominates that location. For example, both Northern and Southern America subregions have high preference towards fixed connections (Figure 9), but Southern America has overall slower speeds that Northern America (Figure 10). Faster speeds are dependent on the quality of the connection infrastructure available to that location, regardless if it is fixed or mobile.


CONCLUSION


We begain this comparative review by asking: in what aspects (kbps, latency, number of users, and geographic location) do fixed and mobile network types outperform each other?

After exploring Ookla's speedtest data on fixed and mobile connections, we can now conclude the following:

Criterion Winner
Faster Download Speed (kbps) Fixed
Faster Upload Speed (kbps) Fixed
Lower Latency (ms) Fixed
Greaeter Number of Users (total devices) Fixed
Geographic Location Fixed/Mobile

Does this mean that fixed connections are superior? Not necessarily. We discovered that fixed connections also have a larger variance in latency and download-to-upload speeds. The benefit of a fixed connection is supposedly to lower latency, however there are instances where latency extends to as long as 4 seconds that are not observed in mobile connections. While upload speeds in fixed connections are superior, for download speeds, mobile connections stand on par.

The difference in preference lies in your need for mobility (as fixed is a stationary connection), upload speed (which is generally better in fixed connections), and the availability of either service in your geographic location. While most of the Americas, Western Europe, and China, among others prefer to speedtest fixed devices, countries found in Africa prefer mobile, and countries found in Southern and Eastern Asia like India evenly prefer the two. The quality of either type may vary depending on what is most popular in your area.

Despite this, overall, we can see that regardless of connection type, connections in general are improving. Both fixed and mobile connections are following a trend of decreasing average latency and increasing average download and upload speed over time (from 2019 to 2022), regardless of the uptake in devices in 2020 due to COVID-19 lockdowns. The internet looks to only get faster from here.


RECOMMENDATIONS


By examining trends and factors such as download speeds, upload speeds, latency, and signal strength across different geographical locations and periods, researchers can uncover patterns and identify areas for improvement. This analysis can be particularly informative for users who must make informed decisions when choosing between mobile or fixed-line connections.

However, it is essential to acknowledge certain limitations of the dataset. Firstly, the data is acquired through voluntary testing, which may only partially represent part of the population. Users who run the speedtest application often do so with a specific purpose, which can introduce a self-selection bias that impacts the study's representativeness.

Additionally, the dataset is gathered only when there is a working line internet connection, or when mobile users being within the coverage of a 4G or 5G tower. Therefore, this study does not consider factors such as internet disconnections, power outages, and hardware specifications.

Despite being informative to consumers, due to the limitations, it is recommended to conduct a proper data gathering to support evidence-based decision-making for policy formulation, network planning, and comparative analysis.


REFERENCES


  • Speedtest by Ookla (n.d.). Speedtest by Ookla Global Fixed and Mobile Network Performance Map Tiles. GitHub. Retrieved May 19, 2023, from https://github.com/teamookla/ookla-open-data#readme
  • Brundritt, R., Munk, S., French, C., & Cai, S. (2022, June 9). Bing Maps Tile System. Microsoft. Retrieved May 19, 2023, from https://learn.microsoft.com/en-us/bingmaps/articles/bing-maps-tile-system
  • Natural Earth (n.d.). 1:50m Cultural Vectors. Retrieved May 19, 2023, from https://www.naturalearthdata.com/downloads/50m-cultural-vectors/
  • GSMA Intelligence (n.d.). The Mobile Economy: Sub-Saharan Africa 2019. GSMAi. Retrieved May 19, 2023, from https://data.gsmaintelligence.com/api-web/v2/research-file-download?id=45121567&file=2794-160719-ME-SSA.pdf